1 Sandbox

# toy data: four numeric vectors treated as rows/observations
a <- c(2, 4, 4, 6)
b <- c(5, 5, 7, 8)
# NOTE(review): 'c' shadows base::c() as a variable name; calls like c(...)
# still resolve to the function, but the name is best avoided
c <- c(9, 9, 9, 8)
d <- c(1, 2, 3, 3)
#row bind four vectors into matrix
mat <- rbind(a, b, c, d)

#view matrix
mat

# pairwise distances between the matrix rows under three different metrics
dist(mat, method="euclidean")
dist(mat, method="manhattan")
dist(mat, method="minkowski")

# generate matrices
# gen_random_matrix()/gen_const_matrix() are project helpers defined elsewhere;
# presumably gen_random_matrix(distribution, n_rows, n_cols, ...) — verify
set.seed(50)
unitib = gen_random_matrix('uniform', 1000, 10)
normtib = gen_random_matrix('normal', 8, 3)
gtib = gen_random_matrix('2groups', 20, 3, group_split = .2)
zerotib = gen_const_matrix(8,3,0)
onetib = gen_const_matrix(8,3,1)

# uniqueness of clustered data (3 groups, 100 x 5)
gtib = gen_random_matrix('3groups', 100, 5)
ugtib = dist_uniq(gtib, 'euclidean')
desc_stats_uniq(ugtib)

# uniqueness of clustered data (2 groups, 100 x 2)
gtib = gen_random_matrix('2groups', 100, 2)
ugtib = dist_uniq(gtib, 'euclidean')
print(desc_stats_uniq(ugtib))

# uniqueness of homogeneous normally-distributed data
tib = gen_random_matrix('normal', 1000, 5)
ugtib = dist_uniq(tib, 'euclidean')
print(desc_stats_uniq(ugtib))

# try the three distance methods on the same data
# NOTE(review): each assignment overwrites the last — only the
# manhattan result reaches desc_stats_uniq below
ugtib = dist_uniq(tib, 'euclidean')
ugtib = dist_uniq(tib, 'mahalanobis')
ugtib = dist_uniq(tib, 'manhattan')
desc_stats_uniq(ugtib)

# high-dimensional case (100 x 100): inspect covariance and mahalanobis
cov(gen_random_matrix('2groups',100,100))
ugtib = dist_uniq(gen_random_matrix('2groups',100,100), 'mahalanobis')
tib = gen_random_matrix('2groups',100,100)
# tol avoids failure on a near-singular covariance matrix
maha_sum_dists = mahalanobis(tib, colMeans(tib), cov(tib), tol=1e-20)

# test uniqueness
tib = unitib

# disabled exploratory code, kept for reference
if (F){
  # scale variables 0 1
  #tib = tib %>% mutate_if(is.numeric, range01)
  
}
#cos = tcosine(tib)

#mat = gen_random_matrix('normal', 1000, 100)
#bigdist(as.matrix(mat), 'analysis/dist_matrix.tmp', method='euclidean')
#dist_uniq()

#utib$sum_dists_div = sum_dists / nrow(utib)
#utib$sum_dists_log = log10(utib$sum_dists+1)
#utib$sum_dists_sqrt = sqrt(utib$sum_dists)
#n = 3
#utib$sum_dists_nrt = utib$sum_dists ^ (1 / n)
#utib$uniq_lin = 1/(1+utib$sum_dists)
#utib$uniq_log = 1/(1+utib$sum_dists_log)
#utib$uniq_sqrt = 1/(1+utib$sum_dists_sqrt)
#utib$sum_cos = rowSums(cos)
#utib$sum_cos_pos = utib$sum_cos-min(utib$sum_cos)
#utib$sum_cos_log = log10(utib$sum_cos+1)

#View(round(utib,4))
#summary(utib)

# plot histograms
#tib_long <- utib %>% pivot_longer(colnames(utib)) %>% as_tibble()
#gp1 <- ggplot(tib_long, aes(x = value)) +    # Draw each column as histogram
  #geom_histogram(bins = 10) + 
#  geom_density(adjust = 1/2) + 
#  facet_wrap(~ name, scales = "free_x")
#print(gp1)

# characteristics of uniq to be saved for diagnostic
#x = scale(utib$sum_dists)

#desc_stats(x)

2 Outlier detection

2.1 Univariate OD

Uniqueness detection is similar to outlier detection, in the sense of detecting extreme values rather than measurement errors. For a single variable, there are well-known methods, e.g., boxplots, n StDev, etc.

“Enderlein (1987) goes even further as the author considers outliers as values that deviate so much from other observations one might suppose a different underlying sampling mechanism. … it sometimes makes sense to formally distinguish two classes of outliers: (i) extreme values and (ii) mistakes.” https://statsandr.com/blog/outliers-detection-in-r

2.2 Multivariate OD

In a multi-variate context, the outliers we are interested in are rare combinations of values. Individually, the values of each variable might not be extreme, but the combination appears relatively far from others.

“The adjusted quantile plot function of the mvoutlier package will solve for and order the squared Mahalanobis Distances for the given observations and plot them against the empirical Chi-Squared distribution function of these values.” https://rpubs.com/Treegonaut/301942

# use mvoutlier
# small 2-group dataset; aq.plot() needs a plain data frame
indf = as.data.frame(gen_random_matrix('2groups', 20, 3))

# adjusted quantile plot: flags observations whose squared Mahalanobis
# distance exceeds the chi-squared quantile given by delta
outliers <- aq.plot(indf, delta=qchisq(0.975, df = ncol(indf)))

2.3 Mahalanobis distance

The Mahalanobis distance is often used for OD in multivariate data.

“In multivariate space, the Mahalanobis distance is the distance between two points. It’s frequently used to locate outliers in statistical investigations involving several variables.” Source: https://www.r-bloggers.com/2021/08/how-to-calculate-mahalanobis-distance-in-r/

intib = gen_random_matrix('2groups', 20, 4)
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
intib
##               V1            V2            V3           V4
## 1    -0.13025849  8.116370e-04   -0.11276348   -0.7251788
## 2     1.01909069 -1.108523e+00    0.69567901    1.0599319
## 3     1.14296182 -1.474331e+00   -0.95890756    1.6737914
## 4    -1.13575309  3.486901e-01    0.65507423    0.3446379
## 5   645.78606253 -4.269135e+00  313.58368326 1741.0413719
## 6     0.45945009 -2.651702e-01    0.03982584   -0.8793451
## 7   725.21394246  5.915347e+02 -478.08687689 -872.5018347
## 8     0.34917436  4.716314e-01    1.11634642   -0.3634300
## 9    -0.06872595  2.016548e-01   -1.13326720   -1.5170335
## 10    0.13694295 -8.544031e-02    0.64557340   -0.0730710
## 11    1.61725901 -9.421741e-02   -0.82546076    0.2499878
## 12 1359.19763869 -1.955787e+03 1383.97300048  480.4196835
## 13   -1.61904267 -8.494488e-01    0.54579147    0.8802772
## 14    1.89656782  2.749332e-01    1.69684280    0.5566241
## 15   -1.96705776  4.422685e-01    0.66416048   -0.4427148
## 16    0.34203167 -3.098145e-01   -0.49975015   -0.9678432
## 17    0.07727154  3.397869e-01    2.62713683   -1.8864650
## 18 -134.30486248  3.717919e+02  573.90274592 1110.6999778
## 19    2.51136635 -1.263875e+00   -1.24187764   -0.8360734
## 20    0.44651969 -8.462236e-01    0.15884342    0.9790320
# x: input data
# center: indicate the mean vector of the distribution
# cov: indicate the covariance matrix of the distribution
intib$maha_dist = mahalanobis(intib, colMeans(intib), cov(intib))

# The p-value is the Chi-Square statistic of the
# Mahalanobis distance with k-1 degrees of freedom, 
# where k is the number of variables.
intib$maha_pvalue <- pchisq(intib$maha_dist, df=ncol(intib), lower.tail=F)
#View(intib)
intib$maha_dist
##  [1]  0.2390106  0.2369317  0.2437489  0.2384861 18.0493579  0.2376424
##  [7] 18.0497306  0.2324304  0.2414791  0.2361890  0.2372170 18.0499471
## [13]  0.2439111  0.2271519  0.2403292  0.2398872  0.2288128 18.0486769
## [19]  0.2395496  0.2395106
intib
##               V1            V2            V3           V4  maha_dist
## 1    -0.13025849  8.116370e-04   -0.11276348   -0.7251788  0.2390106
## 2     1.01909069 -1.108523e+00    0.69567901    1.0599319  0.2369317
## 3     1.14296182 -1.474331e+00   -0.95890756    1.6737914  0.2437489
## 4    -1.13575309  3.486901e-01    0.65507423    0.3446379  0.2384861
## 5   645.78606253 -4.269135e+00  313.58368326 1741.0413719 18.0493579
## 6     0.45945009 -2.651702e-01    0.03982584   -0.8793451  0.2376424
## 7   725.21394246  5.915347e+02 -478.08687689 -872.5018347 18.0497306
## 8     0.34917436  4.716314e-01    1.11634642   -0.3634300  0.2324304
## 9    -0.06872595  2.016548e-01   -1.13326720   -1.5170335  0.2414791
## 10    0.13694295 -8.544031e-02    0.64557340   -0.0730710  0.2361890
## 11    1.61725901 -9.421741e-02   -0.82546076    0.2499878  0.2372170
## 12 1359.19763869 -1.955787e+03 1383.97300048  480.4196835 18.0499471
## 13   -1.61904267 -8.494488e-01    0.54579147    0.8802772  0.2439111
## 14    1.89656782  2.749332e-01    1.69684280    0.5566241  0.2271519
## 15   -1.96705776  4.422685e-01    0.66416048   -0.4427148  0.2403292
## 16    0.34203167 -3.098145e-01   -0.49975015   -0.9678432  0.2398872
## 17    0.07727154  3.397869e-01    2.62713683   -1.8864650  0.2288128
## 18 -134.30486248  3.717919e+02  573.90274592 1110.6999778 18.0486769
## 19    2.51136635 -1.263875e+00   -1.24187764   -0.8360734  0.2395496
## 20    0.44651969 -8.462236e-01    0.15884342    0.9790320  0.2395106
##    maha_pvalue
## 1  0.998635538
## 2  0.998664031
## 3  0.998569308
## 4  0.998642760
## 5  0.002885177
## 6  0.998654328
## 7  0.002884720
## 8  0.998724549
## 9  0.998601258
## 10 0.998674126
## 11 0.998660141
## 12 0.002884454
## 13 0.998567010
## 14 0.998793478
## 15 0.998617287
## 16 0.998623421
## 17 0.998772026
## 18 0.002886014
## 19 0.998628095
## 20 0.998628634

2.4 Sum of distances

This shows the behaviour of the sum of distances on clustered data. The visualisation shows the distances of each point to all other points.

# mixture of 3 gaussian clusters with random centres and sizes
# (generate_gaussian_mix is a project helper defined elsewhere)
gauss_mix = generate_gaussian_mix(3, # number of clusters
  list(sample(1:10, 2), sample(1:10, 2), sample(1:20, 2)), # centres
  sample(1:40, 3)) # sizes

# calculate sum of all distances for each point
gauss_mix$dist_sum = dist(gauss_mix, diag = T, upper = T) %>% as.matrix() %>% rowSums()
# standardise the sums (zscore is a project helper)
gauss_mix$dist_sum_z = zscore(gauss_mix$dist_sum)
# plot results

# scatter plot
ggplot(gauss_mix, aes(x=x, y=y)) + geom_point(aes(colour=dist_sum, size=log(dist_sum))) + 
  scale_colour_gradient() + ggtitle('Sum of distances') + 
  theme_bw()

# scatter plot z score
ggplot(gauss_mix, aes(x=x, y=y)) + geom_point(aes(colour=dist_sum_z, size=dist_sum_z)) + 
   scale_colour_distiller(palette='RdYlBu',direction=1) + 
  ggtitle('Sum of distances (z score)') + 
  theme_bw()

# histogram
ggplot(gauss_mix, aes(dist_sum)) + geom_histogram(bins = 20)

3 Uniqueness

3.1 Univariate uniqueness

Uniqueness can be thought of as \(1 - p\), where p is the probability of encountering a particular observation from random extractions from a set.

For example, these are the percentages of land cover categories in the UK: Farmland 56.7%, Natural 34.9%, Green urban 2.5%, Built on 5.9%. Taking a probabilistic view, we can define the uniqueness of a category as 1-p. The rarest category we would encounter by selecting a random area in the UK is green urban (\(U = .98\)).

Data source: https://www.eea.europa.eu/publications/COR0-landcover

# UK land cover shares (source: EEA); one row per category
uk_landcover = tibble(cat=c('farm','natural','green urban','built'), pc=c(56.7, 34.9, 2.5, 5.9))
# probability of a random area belonging to each category
uk_landcover$p = uk_landcover$pc/100
# uniqueness = 1 - p: rarer categories are more unique
uk_landcover$uniq = 1 - uk_landcover$p

uk_landcover
## # A tibble: 4 x 4
##   cat            pc     p  uniq
##   <chr>       <dbl> <dbl> <dbl>
## 1 farm         56.7 0.567 0.433
## 2 natural      34.9 0.349 0.651
## 3 green urban   2.5 0.025 0.975
## 4 built         5.9 0.059 0.941

To make it more interpretable, we can use the z score. This way, more unique observations emerge from the data.

uk_landcover$uniq_z = round(zscore(uk_landcover$uniq),2)

uk_landcover
## # A tibble: 4 x 5
##   cat            pc     p  uniq uniq_z[,1]
##   <chr>       <dbl> <dbl> <dbl>      <dbl>
## 1 farm         56.7 0.567 0.433      -1.24
## 2 natural      34.9 0.349 0.651      -0.39
## 3 green urban   2.5 0.025 0.975       0.88
## 4 built         5.9 0.059 0.941       0.74

We can think of uniqueness as a deviation from the uniform distribution. If all types of observation occur at the same probability, it is not possible to calculate a meaningful uniqueness score (\(z\) is NA).

uniform_tib = tibble(cat=c('cat1','cat2','cat3','cat4'), pc=1/4, uniq=1-1/4)
uniform_tib$uniq_z = zscore(uniform_tib$uniq)
print(uniform_tib)
## # A tibble: 4 x 4
##   cat      pc  uniq uniq_z[,1]
##   <chr> <dbl> <dbl>      <dbl>
## 1 cat1   0.25  0.75        NaN
## 2 cat2   0.25  0.75        NaN
## 3 cat3   0.25  0.75        NaN
## 4 cat4   0.25  0.75        NaN

We can use measures of entropy to see if the data has heterogeneity that can be used to quantify uniqueness. High entropy means low heterogeneity (no observation will be unique) and vice-versa.

print(paste('Entropy of uniform data', round(entropy(uniform_tib$pc),2)))
## [1] "Entropy of uniform data 1.39"
print(paste('Entropy of heterogeneous data', round(entropy(uk_landcover$pc),2)))
## [1] "Entropy of heterogeneous data 0.95"

3.2 Multi-variate uniqueness

The multivariate case is more interesting and challenging. It is useful for exploratory data analysis (EDA).

In this example, we consider four observations along two dimensions. The geometric interpretation of uniqueness of observation \(a\) in this case is \(u_a = \sum_{b \neq a} dist(a,b)\). To make the results more interpretable, we can obtain the \(z\) scores. It is possible to observe visually that point 1 is the most spatially isolated, while point 2 is the most central to the dataset and therefore has the lowest uniqueness.

3.2.1 Minimal example

# four 2-D points; matrix is filled column-wise: V1 = (-1,0,1,0), V2 = (-1,0,1,1)
multiv_tib = as_tibble(matrix(c(-1,0,1,0,-1,0,1,1), ncol = 2))
print(multiv_tib)
## # A tibble: 4 x 2
##      V1    V2
##   <dbl> <dbl>
## 1    -1    -1
## 2     0     0
## 3     1     1
## 4     0     1
# plot
ggplot(multiv_tib, aes(x=V1, y=V2, label=rownames(multiv_tib))) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='blue', fill='blue') + geom_text(nudge_y = .1) + 
  ggtitle('Location of points')

# get distance matrix; diag/upper keep the full symmetric matrix
dist_mat = as.matrix(dist(multiv_tib, diag = T, upper = T))
print(dist_mat)
##          1        2        3        4
## 1 0.000000 1.414214 2.828427 2.236068
## 2 1.414214 0.000000 1.414214 1.000000
## 3 2.828427 1.414214 0.000000 1.000000
## 4 2.236068 1.000000 1.000000 0.000000
# sum distances: each row sum is that point's uniqueness score
multiv_tib$sum_dist = round(rowSums(dist_mat),2)
multiv_tib$sum_dist_z = round(zscore(multiv_tib$sum_dist),2)

# plot uniqueness
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist)) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) + 
  ggtitle('Sum of distances to other points')

# plot z scores of uniqueness 
ggplot(multiv_tib, aes(x=V1, y=V2, label=sum_dist_z)) + theme(aspect.ratio = 1) +
  geom_point(size=3, colour='green', fill='green') + geom_text(nudge_y = .1) + 
  ggtitle('Sum of distances to other points (z scores)')

3.2.2 Distance-based uniq

Define uniqueness functions

#' Descriptive statistics
#' @param utib tibble produced by dist_uniq(), with sum_dists_z and
#'   sum_dists_class columns.
#' @return tibble with a row with descriptive stats about x (z scores).
desc_stats_uniq <- function(utib){
  x = utib$sum_dists_z
  # shapiro.test() accepts at most 5000 values; sample the z-score VECTOR.
  # (The original sampled the tibble itself, which for a data frame samples
  # its columns, not values.)
  shap_x = x
  if (nrow(utib) > 5000){
    shap_x = sample(x, 5000)
  }
  # run the normality test once and reuse the result
  shap_res = shapiro.test(shap_x)
  
  # percentage of cases in each uniqueness class
  uclasses = table(utib$sum_dists_class)/nrow(utib)*100
  
  tibble(
    n = length(x),
    min = min(x),
    med = median(x),
    mean = mean(x),
    max = max(x),
    pc_very_common = uclasses['very common'],
    pc_common = uclasses['common'],
    pc_inbet = uclasses['medium'],
    pc_rare = uclasses['rare'],
    pc_very_rare = uclasses['very rare'],
    skewness = round(skewness(x),3),
    iqr = round(IQR(x),3),
    # spell out component names instead of relying on partial matching
    shapiro_w = shap_res$statistic,
    shapiro_p = round(shap_res$p.value,5)
  )
}

#' Find a sample size to produce a distance matrix that is manageable in memory
#'
#' Returns the largest number of rows k <= n_row such that
#' (n_col * k)^2 <= max_dist_size. Computed in closed form instead of
#' decrementing one row at a time (which was O(n_row)).
#'
#' @param n_row number of rows in the data
#' @param n_col number of columns in the data
#' @param max_dist_size maximum allowed value of (n_col * n_row)^2
#' @return the (possibly reduced) number of rows to sample
find_sample_sz = function(n_row, n_col, max_dist_size){
  # largest k with (n_col * k)^2 <= max_dist_size
  limit = floor(sqrt(max_dist_size) / n_col)
  return(min(n_row, limit))
}

#' Classify p values returned by the uniqueness function with a 
#' human readable label.
#'
#' @param p_values numeric vector of p values in [0, Inf)
#' @return factor with levels 'very rare' ... 'very common'
classify_uniq = function(p_values){
  # left-open interval boundaries for the p values
  breaks = c(0, .001, .01, .05, .1, Inf)
  labels = c('very rare','rare','medium','common','very common')
  # include.lowest so an exact 0 p value (pnorm can underflow to 0 for
  # extreme z scores) is classified 'very rare' instead of NA
  classif = cut(p_values, breaks, labels = labels, include.lowest = TRUE)
  return(classif)
}

#' Classify p values returned by the uniqueness function with a 
#' significance-style label ('***' = p <= .001, '' = p > .1).
#'
#' @param p_values numeric vector of p values in [0, Inf)
#' @return factor with levels '***','**','*','.',''
classify_p_values = function(p_values){
  # left-open interval boundaries for the p values
  breaks = c(0, .001, .01, .05, .1, Inf)
  labels = c('***','**','*','.','')
  # include.lowest so an exact 0 p value (pnorm underflow) is classified
  # '***' instead of NA
  classif = cut(p_values, breaks, labels = labels, include.lowest = TRUE)
  return(classif)
}

#' Calculate uniqueness based on distance in multi-dimensional space.
#' Every numeric column is scaled.
#' 
#' @param tib input tibble/data frame; non-numeric columns are carried
#'   through but excluded from the distance computation
#' @param dist_method: 'euclidean','mahalanobis','manhattan', 'minkowski', 'canberra'
#' @param scale_cols scale numeric columns (mean 0, sd 1) before computing distances
#' @param impute_na impute missing values (column median) before computing distances
#' @return tib with uniqueness columns (sum_dists, deciles, z scores,
#'   p values, class labels), sorted by decreasing z score. May contain a
#'   sample of the input rows if the full distance matrix would be too large.
dist_uniq = function(tib, dist_method, scale_cols=TRUE, impute_na=FALSE){
  # validate the method before doing any expensive work
  stopifnot(dist_method %in% c('euclidean','manhattan', 'minkowski','canberra','mahalanobis'))
  print(paste('dist_uniq', nrow(tib), ncol(tib), dist_method))
  if (impute_na){
    print('imputing missing values...')
    # impute missing values (project helper; presumably median per column)
    tib = impute_numeric_cols_median(tib)
  }
  if (scale_cols){
    print('scaling data...')
    # scale input variables mean + sd
    tib = tib %>% mutate_if(is.numeric, scale)
  }
  
  # check memory limit for distance matrix
  max_matrix_sz = 1e8
  sample_rows = find_sample_sz(nrow(tib), ncol(tib), max_matrix_sz)
  if (nrow(tib) > sample_rows){
    warning(paste("Distance matrix would be too large, using a sample N =",sample_rows))
    tib = tib %>% sample_n(sample_rows)
  }
  # calculate distances on the numeric columns only
  num_tib = tib %>% select_if(is.numeric)
  if (dist_method == 'mahalanobis'){
    # tolerance has to be set to avoid rounding errors
    sum_dists = mahalanobis(num_tib, colMeans(num_tib, na.rm = TRUE), 
                            cov(num_tib, use="complete.obs"), tol=1e-20)
  } else {
    # all other distances: row sums over the full symmetric distance matrix
    sum_dists = rowSums(as_tibble(as.matrix(dist(num_tib, 
                    diag = TRUE, upper = TRUE, method = dist_method))))
  }
  utib = tib
  # sum of distances
  utib$sum_dists = sum_dists
  # deciles (1 = most distant rows)
  utib$sum_dists_dec = ntile(-utib$sum_dists, 10)
  # z score of distances (zscore is a project helper)
  utib$sum_dists_z = as.numeric(zscore(sum_dists))
  # calculate p value of z score: 
  # we are interested to check if an observation is higher than expected (lower.tail=F) 
  utib$sum_dists_z_p = pnorm(q=utib$sum_dists_z, mean = mean(utib$sum_dists_z, na.rm=TRUE), 
                 sd = sd(utib$sum_dists_z, na.rm=TRUE), lower.tail=FALSE)
  utib$sum_dists_class = classify_uniq(utib$sum_dists_z_p)
  utib$sum_dists_p_thresh = classify_p_values(utib$sum_dists_z_p)
  
  utib$dist_method = dist_method
  # check results: classes can be NA when distances involved missing values
  n_na = sum(is.na(utib$sum_dists_class))
  if (n_na > 0){
    warning(paste('distances could not be calculated for',n_na,
                  'cases. Set impute_na to TRUE to impute missing values.'))
  }
  as_tibble(utib) %>% arrange(-sum_dists_z)
}

#' Main function to calculate uniqueness (deprecated; superseded by dist_uniq)
#' @param tib input data tibble, including numeric and non-numeric columns
#' @param sim_method 'cosine' or a dist() method name
#' @param handle_missing_values recompute pairwise similarities for rows
#'   whose similarity to every other row is NA, using only the columns that
#'   are complete for each pair
#' @return a list with results and the similarity matrix between all rows.
OLD_calc_uniqueness_ = function(tib, sim_method, handle_missing_values=TRUE){
  # get only numeric columns
  indata = tib %>% select_if(is.numeric)
  stopifnot(ncol(indata) > 1)
  
  stopifnot(sim_method %in% c('cosine', "euclidean", "maximum", "manhattan",
                              "canberra", "binary", "minkowski"))
  
  if (sim_method == 'cosine'){
    # scale columns between 0 and 1 (range01 is a project helper)
    scaled_indata = indata %>% mutate_if(is.numeric, range01)
    # calculate cosine similarities between rows
    sim_matrix = tcosine(scaled_indata) %>% as_tibble()
  } else {
    # distance-based similarity between rows: sim = 1 / (1 + distance)
    edist = dist(indata, diag=TRUE, upper=TRUE, method=sim_method) %>% as.matrix() %>% as_tibble()
    stopifnot(edist >= 0)
    sim_matrix = 1/(1+edist)
  }
  stopifnot(sim_matrix >= 0 & sim_matrix <= 1)
  
  if (handle_missing_values){
    # rows whose similarity to every other row is NA: recompute each pair's
    # cosine similarity using only the columns with no missing values
    missing_cases = rowSums(is.na(sim_matrix))
    missing_cases_row_idx = which(missing_cases == (ncol(sim_matrix)-1))
    for (row_idx in missing_cases_row_idx){
      for (other_row_idx in seq_len(nrow(indata))){
        non_null_pair = indata[c(row_idx,other_row_idx),] %>% 
          select_if(is.numeric) %>% select_if(~ !any(is.na(.)))
        pair_cosine = tcosine(non_null_pair)
        sim_matrix[row_idx, other_row_idx] = pair_cosine[1,2]
        sim_matrix[other_row_idx, row_idx] = pair_cosine[2,1]
      }
    }
  }
  uniq_df = tib
  uniq_df$na_count = rowSums(is.na(indata))
  # calculate uniqueness: subtract the self-similarity (always 1) from the row sum
  uniq_df$sim_sum = rowSums(sim_matrix, na.rm=TRUE)-1
  uniq_df$sim_sum[uniq_df$sim_sum == 0] <- NA
  uniq_df$uniq = 1 - (uniq_df$sim_sum / nrow(uniq_df))
  uniq_df$uniq_rank = rank(-uniq_df$uniq)
  # NOTE(review): renamed z_score() -> zscore() for consistency with the
  # helper used throughout the rest of this file — confirm no separate
  # z_score() helper exists
  uniq_df$uniq_z = zscore(uniq_df$uniq)
  uniq_df$uniq_dist_avg = uniq_df$uniq - mean(uniq_df$uniq)
  stopifnot(uniq_df$uniq >= 0 & uniq_df$uniq <= 1)
  return(list(uniqueness=uniq_df, sim_matrix=sim_matrix))
}

3.2.3 Experiment with high dim

On high dimensional data, the interpretation of \(u\) is less immediate. Therefore, it is useful to observe the behaviour of the uniqueness measure \(u\) through a Monte Carlo method.

A core idea is that data with uniformly and normally distributed variables can still produce high values of \(u\), generating false positives in the search for unique observations. The Shapiro-Wilk test on \(u\) consistently returns low W on heterogeneous distributions with actual groups and high W on homogeneous data, regardless of scale.

3.2.3.1 Generate synth data

#t = gen_random_matrix('normal', 10, 5)
#u = calc_uniqueness(t, 'euclidean')
#View(u$uniqueness)
#ggplot(u$uniqueness, aes(x=uniq_dist_avg)) + xlab("uniq_dist_avg") + 
#  geom_histogram(color="white", fill="lightgreen", bins = 10)#
#ggplot(u$uniqueness, aes(x=uniq)) + #xlab("uniq_dist_avg") + 
#  geom_histogram(color="white", fill="gray", bins = 10)

if (T){
  # synthetic data: Monte Carlo sweep over distribution type, distance
  # method, data scale and dataset size, 10 trials per combination
  res = tibble()
  # re-randomise the seed for the experiment
  set.seed(NULL)
  # NOTE(review): the unbraced for loops below all chain onto the single
  # { } block — effectively one six-level nested loop
  for (rand_data in c('normal','uniform','2groups','3groups'))
  for (dist_method in c('euclidean','manhattan','minkowski','mahalanobis'))
  for (data_scale in c(1,100,1000))
  for (row_n in c(10,100,1000,10000))
  for (col_n in c(2,10,20))
  for (trial in seq(10))
  {
    if (row_n == 0 | col_n == 0) next()
    #if (dist_method != 'mahalanobis') next()
    rand_tib = gen_random_matrix(rand_data, row_n, col_n, data_scale)
    utib = dist_uniq(rand_tib, dist_method)
    res_row = desc_stats_uniq(utib)
    # prepend the experimental-design columns to the stats row
    res_row = res_row %>% add_column(distrib = rand_data, 
                             dist_method = dist_method,
                             data_scale = data_scale,
                             row_n = row_n, 
                             col_n = col_n, 
                             sample_n = nrow(utib),
                             trial = trial,
                             .before = "n")
    res = bind_rows(res, res_row)
  }
  
  # persist results for the analysis section
  saveRDS(res, 'analysis/high_dimensional_experiment_1.rds')
  openxlsx::write.xlsx(res, 'analysis/high_dimensional_experiment_1.xlsx')
}

3.2.3.2 Analyse results

Summarise experiment 1 results to observe trends in \(u\) in different settings.

# reload the experiment results saved by the previous chunk
res_tib = readRDS('analysis/high_dimensional_experiment_1.rds') 
  #%>% #mutate_if(is.character, factor)
print(nrow(res_tib))
## [1] 5760
summary_res = tibble()

# summarise the results under several different groupings of the
# experimental-design variables; all groupings go in one long table
for (cols in list(c('row_n'), c('col_n'), 
    c('distrib','row_n'), c('distrib','col_n'), c('distrib'),
    c('dist_method'), c('data_scale'),
    c('distrib','dist_method'),
    c('distrib','dist_method','data_scale'))){
  print(cols)
  # label identifying this grouping, stored alongside the stats
  cols_str = paste(cols, collapse = ' ')
  res_stats_tib = res_tib %>% 
    #group_by(distrib, dist_method) %>% 
    group_by(across(all_of(cols))) %>%  # all_of
    dplyr::summarise(cols = cols_str, 
      n = n(), 
      mn_dist_min = mean(min),
      mn_dist_max = mean(max),
      mn_vcommon = mean(pc_very_common),
      mn_common = mean(pc_common),
      mn_inbet = mean(pc_inbet),
      mn_rare = mean(pc_rare),
      mn_vrare = mean(pc_very_rare),
      mn_skew = mean(skewness)) %>% 
    mutate_if(is.numeric, round, 2)
  
  # accumulate every grouping's summary into one tibble
  summary_res = bind_rows(summary_res, res_stats_tib)
}
## [1] "row_n"
## [1] "col_n"
## [1] "distrib" "row_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib" "col_n"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib"
## [1] "dist_method"
## [1] "data_scale"
## [1] "distrib"     "dist_method"
## `summarise()` has grouped output by 'distrib'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Column `distrib`
## [1] "distrib"     "dist_method" "data_scale"
## `summarise()` has grouped output by 'distrib', 'dist_method'. You can override using the `.groups` argument.
## `mutate_if()` ignored the following grouping variables:
## Columns `distrib`, `dist_method`
summary_res
## # A tibble: 110 x 15
##    row_n cols            n mn_dist_min mn_dist_max mn_vcommon mn_common mn_inbet
##    <dbl> <chr>       <dbl>       <dbl>       <dbl>      <dbl>     <dbl>    <dbl>
##  1    10 row_n        1440       -1.03        2.01       87.9      3.38     5.68
##  2   100 row_n        1440       -1.16        3.37       88.3      3.17     4.77
##  3  1000 row_n        1440       -1.28        4.58       88.6      3.2      4.41
##  4 10000 row_n        1440       -1.28        4.96       88.7      3.19     4.38
##  5    NA col_n        1920       -0.85        4.77       89.0      3.05     4.31
##  6    NA col_n        1920       -1.32        3.45       88.2      3.32     4.71
##  7    NA col_n        1920       -1.39        2.98       87.8      3.33     5.41
##  8    10 distrib ro…   360       -0.68        2.09       83.6      2.64    11.2 
##  9   100 distrib ro…   360       -0.46        3.38       84.2      2.89     8.61
## 10  1000 distrib ro…   360       -0.46        4.72       84.8      3.23     7.24
## # … with 100 more rows, and 7 more variables: mn_rare <dbl>, mn_vrare <dbl>,
## #   mn_skew <dbl>, col_n <dbl>, distrib <chr>, dist_method <chr>,
## #   data_scale <dbl>

Impact of variables on distribution of uniqueness values.

Different distributions generate different common/rare patterns. 1,440 cases observed for each combination. Uniform and normally-distributed data generate a similar pattern, with normally-distributed data having more rare/very rare observations (2%, 1.1%). Uneven distributions with groups, as expected by their design, generate more rare/very rare observations (4.1% and 7.3%), having a large central group of close observations and relatively many distant observations:

Using the Dunn’s test, the only two distributions that are not significantly different are the normal and the uniform, which generate similar uniqueness patterns (TODO: why?).

# impact of data distribution
#summary_res %>% filter(cols == 'distrib') %>% dplyr::select(distrib, n, mn_dist_min, mn_dist_max, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% flextable()
summary_results_tib = summary_res

# compare groups by number of rows
tib = summary_results_tib %>% filter(cols == 'distrib') %>% 
  dplyr::select(distrib, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
tib %>% flextable()
# rebuild categorical samples from the class percentages so a
# non-parametric group comparison can be run
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$distrib){
  # class percentages for this distribution, as a plain vector
  pcs = tib %>% filter(distrib == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  sz = tib %>% filter(distrib == i) %>% dplyr::select(n) %>% .[[1]]
  # generate_cat_data_from_pc: project helper — presumably expands the
  # percentages into sz categorical draws; verify
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1  group2     n1    n2 statistic        p    p.adj p.adj.signif
## * <chr> <chr>   <chr>   <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 val   2groups 3groups  1438  1436     8.14  3.88e-16 2.33e-15 ****        
## 2 val   2groups normal   1438  1437     3.03  2.44e- 3 7.33e- 3 **          
## 3 val   2groups uniform  1438  1438     2.50  1.25e- 2 2.50e- 2 *           
## 4 val   3groups normal   1436  1437    -5.11  3.19e- 7 1.28e- 6 ****        
## 5 val   3groups uniform  1436  1438    -5.65  1.65e- 8 8.23e- 8 ****        
## 6 val   normal  uniform  1437  1438    -0.533 5.94e- 1 5.94e- 1 ns

The size of the dataset (n observations x n variables) has minimal impact on the behaviour of \(u\). Varying the number of observations from 10 to 10,000, it is possible to observe that with very few observations (10) it is less likely to obtain very rare observations, as expected. The other three rows converge and do not have significant differences (using Dunn’s test):

Rows:

# compare groups by number of rows
tib = summary_results_tib %>% filter(cols == 'row_n') %>% 
  dplyr::select(row_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
u_labels = c('vcommon','common','inbet','rare','vrare')
# rebuild one categorical sample of uniqueness labels per row-count group,
# using the mean percentage of each category as generation weights
group_samples = lapply(tib$row_n, function(grp){
  grp_row = tib %>% filter(row_n == grp)
  # mean percentages of the five uniqueness categories for this group
  pcs = grp_row %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  # number of synthetic observations to generate
  sz = grp_row %>% dplyr::select(n) %>% .[[1]]
  tibble(group = grp, val = generate_cat_data_from_pc(pcs, u_labels, sz))
})
all_vals = bind_rows(group_samples)

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1 group2    n1    n2 statistic       p  p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>   <dbl>  <dbl> <chr>       
## 1 val   10     100     1438  1438     2.34  0.0190  0.0761 ns          
## 2 val   10     1000    1438  1439     2.68  0.00736 0.0368 *           
## 3 val   10     10000   1438  1437     2.78  0.00537 0.0322 *           
## 4 val   100    1000    1438  1439     0.335 0.738   1      ns          
## 5 val   100    10000   1438  1437     0.439 0.660   1      ns          
## 6 val   1000   10000   1439  1437     0.105 0.916   1      ns

The number of columns does not have an impact on the results:

# compare groups by number of columns
tib = summary_results_tib %>% filter(cols == 'col_n') %>% 
  dplyr::select(col_n, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare)
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$col_n){
  # progress: current column-count group (echoed in the output below)
  print(i)
  # mean percentage of each uniqueness category for this group
  pcs = tib %>% filter(col_n == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  # number of synthetic observations to generate
  sz = tib %>% filter(col_n == i) %>% dplyr::select(n) %>% .[[1]]
  print(paste('generating synth data:',sz))
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}
## [1] 2
## [1] "generating synth data: 1920"
## [1] 10
## [1] "generating synth data: 1920"
## [1] 20
## [1] "generating synth data: 1920"
# Dunn's test to check if the groups are different (non-parametric)
# pairwise comparisons between the column-count groups
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
##   .y.   group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
## 1 val   2      10      1917  1917    -1.49  0.137  0.274  ns          
## 2 val   2      20      1917  1916    -2.23  0.0260 0.0779 ns          
## 3 val   10     20      1917  1916    -0.740 0.460  0.460  ns

The four distance methods do not generate significantly different proportions of \(u\) results.

# compare distance metrics
tib = summary_results_tib %>% filter(cols == 'dist_method') %>% 
  dplyr::select(dist_method, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
u_labels = c('vcommon','common','inbet','rare','vrare')
# rebuild one categorical sample of uniqueness labels per distance metric,
# using the mean percentage of each category as generation weights
per_metric = lapply(tib$dist_method, function(metric){
  metric_row = tib %>% filter(dist_method == metric)
  # mean percentages of the five uniqueness categories for this metric
  pcs = metric_row %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  # number of synthetic observations to generate
  sz = metric_row %>% dplyr::select(n) %>% .[[1]]
  tibble(group = metric, val = generate_cat_data_from_pc(pcs, u_labels, sz))
})
all_vals = bind_rows(per_metric)

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 6 x 9
##   .y.   group1      group2         n1    n2 statistic     p p.adj p.adj.signif
## * <chr> <chr>       <chr>       <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 val   euclidean   mahalanobis  1437  1437    1.01   0.313     1 ns          
## 2 val   euclidean   manhattan    1437  1437   -0.111  0.912     1 ns          
## 3 val   euclidean   minkowski    1437  1437   -0.128  0.898     1 ns          
## 4 val   mahalanobis manhattan    1437  1437   -1.12   0.263     1 ns          
## 5 val   mahalanobis minkowski    1437  1437   -1.14   0.256     1 ns          
## 6 val   manhattan   minkowski    1437  1437   -0.0174 0.986     1 ns

Data scale does not make any difference to the results either.

# compare data scale
tib = summary_results_tib %>% filter(cols == 'data_scale') %>% 
  dplyr::select(data_scale, n, mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% unique()
all_vals = tibble()
u_labels = c('vcommon','common','inbet','rare','vrare')
for(i in tib$data_scale){
  # mean percentage of each uniqueness category for this data scale
  pcs = tib %>% filter(data_scale == i) %>% 
    dplyr::select(mn_vcommon, mn_common, mn_inbet, mn_rare, mn_vrare) %>% t() %>% as.vector()
  # number of synthetic observations to generate for this group
  sz = tib %>% filter(data_scale == i) %>% dplyr::select(n) %>% .[[1]]
  vals = generate_cat_data_from_pc(pcs, u_labels, sz)
  row_tib = tibble(group = i, val = vals)
  all_vals = bind_rows(all_vals, row_tib)
}

# Dunn's test to check if the groups are different (non-parametric)
res_dunn = all_vals %>% rstatix::dunn_test(val ~ group)
res_dunn
## # A tibble: 3 x 9
##   .y.   group1 group2    n1    n2 statistic     p p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>       
## 1 val   1      100     1918  1917    0.248  0.805     1 ns          
## 2 val   1      1000    1918  1918    0.297  0.766     1 ns          
## 3 val   100    1000    1917  1918    0.0499 0.960     1 ns

4 Case studies

4.1 London (land use and POIs)

TODO

4.2 London boroughs

TODO: Boroughs of Greater London with socio-economic variables.

4.2.1 Prep data

# load Greater London borough boundaries with 2015 profile variables (GeoJSON)
lnd_boro = st_read('data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson')
## Reading layer `london_boroughs_profiles_2015' from data source `/Users/andreaballatore/Dropbox/DRBX_Docs/Work/Projects/github_projects/calculating-uniqueness/data/london/london_borough_profiles_2015/london_boroughs_profiles_2015.geojson' using driver `GeoJSON'
## Simple feature collection with 33 features and 89 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -0.5103751 ymin: 51.28676 xmax: 0.3340156 ymax: 51.69187
## geographic CRS: WGS 84
# select variables
# list all available profile fields before narrowing down to the analysis variables
print(names(lnd_boro))
##  [1] "fid"                                                                                          
##  [2] "gss"                                                                                          
##  [3] "name"                                                                                         
##  [4] "hectares"                                                                                     
##  [5] "nonld_area"                                                                                   
##  [6] "ons_inner"                                                                                    
##  [7] "area_name"                                                                                    
##  [8] "inner_outer_london"                                                                           
##  [9] "gla_population_estimate_2017"                                                                 
## [10] "gla_household_estimate_2017"                                                                  
## [11] "inland_area_hectares"                                                                         
## [12] "population_density_per_hectare_2017"                                                          
## [13] "average_age_2017"                                                                             
## [14] "proportion_of_population_aged_0_15_2015"                                                      
## [15] "proportion_of_population_of_working_age_2015"                                                 
## [16] "proportion_of_population_aged_65_and_over_2015"                                               
## [17] "net_internal_migration_2015"                                                                  
## [18] "net_international_migration_2015"                                                             
## [19] "net_natural_change_2015"                                                                      
## [20] "pc_of_resident_population_born_abroad_2015"                                                   
## [21] "largest_migrant_population_by_country_of_birth_2011"                                          
## [22] "pc_of_largest_migrant_population_2011"                                                        
## [23] "second_largest_migrant_population_by_country_of_birth_2011"                                   
## [24] "pc_of_second_largest_migrant_population_2011"                                                 
## [25] "third_largest_migrant_population_by_country_of_birth_2011"                                    
## [26] "pc_of_third_largest_migrant_population_2011"                                                  
## [27] "pc_of_population_from_bame_groups_2016"                                                       
## [28] "pc_people_aged_3_whose_main_language_is_not_english_2011_census"                              
## [29] "overseas_nationals_entering_the_uk_nino_2015_16"                                              
## [30] "new_migrant_nino_rates_2015_16"                                                               
## [31] "largest_migrant_population_arrived_during_2015_16"                                            
## [32] "second_largest_migrant_population_arrived_during_2015_16"                                     
## [33] "third_largest_migrant_population_arrived_during_2015_16"                                      
## [34] "employment_rate_pc_2015"                                                                      
## [35] "male_employment_rate_2015"                                                                    
## [36] "female_employment_rate_2015"                                                                  
## [37] "unemployment_rate_2015"                                                                       
## [38] "youth_unemployment_claimant_rate_18_24_dec_15"                                                
## [39] "proportion_of_16_18_year_olds_who_are_neet_pc_2014"                                           
## [40] "proportion_of_the_working_age_population_who_claim_out_of_work_benefits_pc_may_2016"          
## [41] "pc_working_age_with_a_disability_2015"                                                        
## [42] "proportion_of_working_age_people_with_no_qualifications_pc_2015"                              
## [43] "proportion_of_working_age_with_degree_or_equivalent_and_above_pc_2015"                        
## [44] "gross_annual_pay_2016"                                                                        
## [45] "gross_annual_pay_male_2016"                                                                   
## [46] "gross_annual_pay_female_2016"                                                                 
## [47] "modelled_household_median_income_estimates_2012_13"                                           
## [48] "pc_adults_that_volunteered_in_past_12_months_2010_11_to_2012_13"                              
## [49] "number_of_jobs_by_workplace_2014"                                                             
## [50] "pc_of_employment_that_is_in_public_sector_2014"                                               
## [51] "jobs_density_2015"                                                                            
## [52] "number_of_active_businesses_2015"                                                             
## [53] "two_year_business_survival_rates_started_in_2013"                                             
## [54] "crime_rates_per_thousand_population_2014_15"                                                  
## [55] "fires_per_thousand_population_2014"                                                           
## [56] "ambulance_incidents_per_hundred_population_2014"                                              
## [57] "median_house_price_2015"                                                                      
## [58] "average_band_d_council_tax_charge__2015_16"                                                   
## [59] "new_homes_net_2015_16_provisional"                                                            
## [60] "homes_owned_outright_2014_pc"                                                                 
## [61] "being_bought_with_mortgage_or_loan_2014_pc"                                                   
## [62] "rented_from_local_authority_or_housing_association_2014_pc"                                   
## [63] "rented_from_private_landlord_2014_pc"                                                         
## [64] "pc_of_area_that_is_greenspace_2005"                                                           
## [65] "total_carbon_emissions_2014"                                                                  
## [66] "household_waste_recycling_rate_2014_15"                                                       
## [67] "number_of_cars_2011_census"                                                                   
## [68] "number_of_cars_per_household_2011_census"                                                     
## [69] "pc_of_adults_who_cycle_at_least_once_per_month_2014_15"                                       
## [70] "average_public_transport_accessibility_score_2014"                                            
## [71] "achievement_of_5_or_more_a_c_grades_at_gcse_or_equivalent_including_english_and_maths_2013_14"
## [72] "rates_of_children_looked_after_2016"                                                          
## [73] "pc_of_pupils_whose_first_language_is_not_english_2015"                                        
## [74] "pc_children_living_in_out_of_work_households_2015"                                            
## [75] "male_life_expectancy_2012_14"                                                                 
## [76] "female_life_expectancy_2012_14"                                                               
## [77] "teenage_conception_rate_2014"                                                                 
## [78] "life_satisfaction_score_2011_14_out_of_10"                                                    
## [79] "worthwhileness_score_2011_14_out_of_10"                                                       
## [80] "happiness_score_2011_14_out_of_10"                                                            
## [81] "anxiety_score_2011_14_out_of_10"                                                              
## [82] "childhood_obesity_prevalance_pc_2015_16"                                                      
## [83] "people_aged_17_with_diabetes_pc"                                                              
## [84] "mortality_rate_from_causes_considered_preventable_2012_14"                                    
## [85] "political_control_in_council"                                                                 
## [86] "proportion_of_seats_won_by_conservatives_in_2014_election"                                    
## [87] "proportion_of_seats_won_by_labour_in_2014_election"                                           
## [88] "proportion_of_seats_won_by_lib_dems_in_2014_election"                                         
## [89] "turnout_at_2014_local_elections"                                                              
## [90] "geometry"
# keep id, name and a small set of socio-economic indicators,
# renamed to short readable column names
lnd_boro = lnd_boro %>% select(gss, name, population_density_per_hectare_2017, average_age_2017, pc_of_resident_population_born_abroad_2015, employment_rate_pc_2015, modelled_household_median_income_estimates_2012_13, proportion_of_seats_won_by_conservatives_in_2014_election, proportion_of_seats_won_by_labour_in_2014_election, median_house_price_2015) %>% 
  rename(id = gss,
         pop_dens = population_density_per_hectare_2017,
         average_age = average_age_2017,
         pop_born_abroad = pc_of_resident_population_born_abroad_2015,
         employment = employment_rate_pc_2015,
         household_income=modelled_household_median_income_estimates_2012_13,
         conservative_seats=proportion_of_seats_won_by_conservatives_in_2014_election,
         labour_seats=proportion_of_seats_won_by_labour_in_2014_election,
         house_price=median_house_price_2015)
# quick-look maps of all selected variables
plot(lnd_boro %>% select(-id,-name), border=NA)

# integrate missing data
# from https://data.gov.uk/dataset/248f5f04-23cf-4470-9216-0d0be9b877a8/london-borough-profiles-and-atlas/datafile/1e5d5226-a1a7-4097-afbf-d3bd39226c39/preview
#lnd_boro[lnd_boro$name == "Kensington and Chelsea", "gross_annual_pay_2016"] = 63620

Find problematic variables.

# get correlations
# pairwise Kendall correlations between the numeric variables, in long format
lnd_boro_corr = lnd_boro %>% st_drop_geometry() %>% select(-name,-id) %>% correlate(method='kendall') %>% stretch()
## 
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# observe high correlations
# |tau| > .5 flags variable pairs that are strongly associated
lnd_boro_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 10 x 3
##    x                  y                       r
##    <chr>              <chr>               <dbl>
##  1 pop_dens           average_age        -0.514
##  2 average_age        pop_dens           -0.514
##  3 average_age        conservative_seats  0.586
##  4 average_age        labour_seats       -0.627
##  5 household_income   conservative_seats  0.501
##  6 conservative_seats average_age         0.586
##  7 conservative_seats household_income    0.501
##  8 conservative_seats labour_seats       -0.697
##  9 labour_seats       average_age        -0.627
## 10 labour_seats       conservative_seats -0.697
# remove Labour variable
# (strongly correlated with conservative_seats, tau = -.697 above)
lnd_boro = lnd_boro %>% select(-labour_seats)
# descriptive statistics incl. normality diagnostics (norm=T)
lnd_boro_desc_stats = t(stat.desc(lnd_boro %>% st_drop_geometry() %>% select(-id,-name), norm=T))
lnd_boro_desc_stats
##                    nbr.val nbr.null nbr.na          min          max
## pop_dens                33        0      0     21.84036     155.6205
## average_age             33        0      0     31.40000      43.2000
## pop_born_abroad         32        0      1     10.90000      54.1000
## employment              33        0      0     64.60000      79.6000
## household_income        33        0      0  28780.00000   63620.0000
## conservative_seats      32        5      1      0.00000      85.0000
## house_price             33        0      0 243500.00000 1200000.0000
##                          range          sum       median         mean
## pop_dens              133.7801     2457.754     59.17539     74.47740
## average_age            11.8000     1200.400     36.20000     36.37576
## pop_born_abroad        43.2000     1168.400     36.90000     36.51250
## employment             15.0000     2399.600     73.10000     72.71515
## household_income    34840.0000  1308190.000  37040.00000  39642.12121
## conservative_seats     85.0000     1046.898     30.00000     32.71556
## house_price        956500.0000 15360443.000 410000.00000 465467.96970
##                         SE.mean CI.mean.0.95          var      std.dev
## pop_dens           6.856818e+00    13.966880 1.551526e+03 3.938942e+01
## average_age        4.330790e-01     0.882153 6.189394e+00 2.487849e+00
## pop_born_abroad    1.855380e+00     3.784072 1.101579e+02 1.049561e+01
## employment         7.345005e-01     1.496128 1.780320e+01 4.219384e+00
## household_income   1.287259e+03  2622.061541 5.468221e+07 7.394742e+03
## conservative_seats 4.776588e+00     9.741915 7.301052e+02 2.702046e+01
## house_price        3.557386e+04 72461.579214 4.176148e+10 2.043563e+05
##                      coef.var   skewness   skew.2SE   kurtosis    kurt.2SE
## pop_dens           0.52887744  0.5506993  0.6738271 -0.9455118 -0.59211886
## average_age        0.06839306  0.4105474  0.5023395  0.2371737  0.14852807
## pop_born_abroad    0.28745261 -0.4072860 -0.4913485 -0.1454105 -0.08982929
## employment         0.05802620 -0.2030418 -0.2484388 -1.0326806 -0.64670761
## household_income   0.18653750  1.3316678  1.6294081  1.8164895  1.13756141
## conservative_seats 0.82592087  0.3562270  0.4297511 -1.2819967 -0.79197072
## house_price        0.43903399  1.8238259  2.2316052  3.3048996  2.06966585
##                    normtest.W   normtest.p
## pop_dens            0.9229907 2.219048e-02
## average_age         0.9747445 6.211098e-01
## pop_born_abroad     0.9603938 2.815670e-01
## employment          0.9631147 3.158213e-01
## household_income    0.8804765 1.723490e-03
## conservative_seats  0.9128969 1.340429e-02
## house_price         0.7935325 2.435519e-05
# histograms to see normality
# one facet per variable; free x scales accommodate the different units
ggplot(gather(lnd_boro %>% st_drop_geometry() %>% select(-name,-id)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).

The variables whose skewness is a concern are median_house_price_2015 and the modelled household income. They will be normalised with log10.

# normalise and scale
# log10(x+1) reduces the right skew of the two monetary variables
lnd_boro$house_price = log10(lnd_boro$house_price+1)
lnd_boro$household_income = log10(lnd_boro$household_income+1)

# skewness is now acceptable
# histograms to see normality (re-check after the transform)
ggplot(gather(lnd_boro %>% st_drop_geometry() %>% select(-name,-id)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 2 rows containing non-finite values (stat_bin).

Multidimensional scaling of boroughs.

# classical MDS (2 dimensions) on the scaled numeric variables
mds <- lnd_boro %>% select(-name,-id) %>% st_drop_geometry() %>% 
  mutate_if(is.numeric, scale) %>% dist() %>% cmdscale() %>% as_tibble()
colnames(mds) <- c("Dim1", "Dim2")
# attach borough names for labelling (row order preserved by the pipeline)
mds$name = lnd_boro$name

# plot MDS
ggplot(mds, aes(x=Dim1, y=Dim2)) +
  ggtitle("Multi-dimensional scaling of Greater London boroughs") + 
  geom_point(size=.2) +
  geom_text(
    label=mds$name, 
    size=2,
    #nudge_x = .25, 
    nudge_y = .1, 
    check_overlap = F
  ) + theme_bw()

# mds no longer needed
rm(mds)

4.2.2 Calc uniqueness

Calculate uniqueness of boroughs based on all variables.

# 'euclidean','manhattan', 'minkowski', mahalanobis'

# filter out very rare
#lnd_boro = lnd_boro %>% filter(!(id %in% c('E09000001','E09000020')))

# distance metric used for the uniqueness calculation
method = 'euclidean'
lnd_boro_uniq = dist_uniq(lnd_boro %>% st_drop_geometry(), method)
## [1] "dist_uniq 33 9 euclidean"
## [1] "scaling data..."
#lnd_boro_u_man = dist_uniq(lnd_boro %>% st_drop_geometry(), 'manhattan')
#lnd_boro_u_mink = dist_uniq(lnd_boro %>% st_drop_geometry(), 'minkowski')
#lnd_boro_u_maha = dist_uniq(lnd_boro %>% st_drop_geometry(), 'mahalanobis', impute_na = T)

# export the uniqueness scores (rounded to 4 dp), one file per metric
openxlsx::write.xlsx(lnd_boro_uniq %>% mutate_if(is.numeric, round, 4),
                     paste0('analysis/london_boroughs_uniqueness-',method,'.xlsx'))

4.2.3 Analyse and viz

# bar chart
#lnd_boro_uniq %>% select(sum_dists_class) %>% plot()
# add geometries
# presumably joins on the shared id column — verify merge keys if columns change
lnd_boro_uniq_geo = merge(lnd_boro %>% select(id), lnd_boro_uniq)

# maps
plot_uniq_map(lnd_boro_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

# heatmaps
# NOTE(review): meaning of the second/third args (T, 50) not visible here — confirm against gen_variable_heatmap definition
gen_variable_heatmap(lnd_boro_uniq_geo, T, 50)
## # A tibble: 297 x 3
##    row_id                 col_name           value[,1]
##    <chr>                  <chr>                  <dbl>
##  1 City of London         pop_dens           -1.12    
##  2 City of London         average_age         2.74    
##  3 City of London         pop_born_abroad    NA       
##  4 City of London         employment         -1.92    
##  5 City of London         household_income    2.84    
##  6 City of London         conservative_seats NA       
##  7 City of London         house_price         1.69    
##  8 City of London         sum_dists_z         3.43    
##  9 City of London         sum_dists_z_p       0.000297
## 10 Kensington and Chelsea pop_dens            1.44    
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).

# second heatmap with the flag flipped to F — confirm semantics against gen_variable_heatmap definition
gen_variable_heatmap(lnd_boro_uniq_geo, F, 50)
## # A tibble: 297 x 3
##    row_id         col_name           value[,1]
##    <chr>          <chr>                  <dbl>
##  1 Waltham Forest pop_dens             -0.0839
##  2 Waltham Forest average_age          -0.513 
##  3 Waltham Forest pop_born_abroad       0.0655
##  4 Waltham Forest employment            0.0912
##  5 Waltham Forest household_income     -0.964 
##  6 Waltham Forest conservative_seats   -0.224 
##  7 Waltham Forest house_price          -0.463 
##  8 Waltham Forest sum_dists_z          -1.03  
##  9 Waltham Forest sum_dists_z_p         0.849 
## 10 Greenwich      pop_dens             -0.388 
## # … with 287 more rows
## Warning: Removed 2 rows containing missing values (geom_text).

# choropleth
# Based on <https://mharinga.github.io/choropleth.html> (accessed in May 2022).
#u_colors = rev(toupper(c('#f0f9e8','#bae4bc','#7bccc4','#43a2ca','#0868ac')))
#lnd_boro_u_geo %>% select(sum_dists_class) %>% plot(border = 'lightgray', 
#        pal = u_colors,
#        main='Uniqueness of London boroughs')

4.3 Countries (World Bank data)

Calculate uniqueness of countries based on World Bank indicators (2019).

4.3.1 Prep data

Select countries with at least 1 million people:

# load World Bank 2019 indicators with country geometries;
# keep countries with a valid iso3c code, >= 1 million people, and a valid iso_a3
wb_geo = read_sf("data/world_bank/indicators_by_year/world_bank_indicators_countries-2019.geojson") %>% filter(!is.na(iso3c) & sp.pop.totl >= 1e6 & iso_a3 != '-99')
names(wb_geo)
##  [1] "iso_a2"            "iso_a3"            "name"             
##  [4] "name_long"         "abbrev"            "type"             
##  [7] "continent"         "region_un"         "subregion"        
## [10] "region_wb"         "un_a3"             "wb_a2"            
## [13] "wb_a3"             "iso3c"             "country"          
## [16] "date"              "it.net.user.zs"    "ms.mil.xpnd.gd.zs"
## [19] "ne.trd.gnfs.zs"    "ny.gdp.mktp.cd"    "ny.gdp.mktp.pp.cd"
## [22] "sl.isv.ifrm.zs"    "sp.pop.grow"       "sp.pop.totl"      
## [25] "sp.rur.totl.zs"    "sp.urb.totl.in.zs" "region"           
## [28] "income_group"      "eu"                "geometry"
# number of countries retained after filtering
nrow(wb_geo)
## [1] 157
# sanity check: country codes must uniquely identify rows
stopifnot(is_unique(wb_geo$iso_a3))
stopifnot(is_unique(wb_geo$iso_a2))

# use Winkel Tripel for visualisations
crs_wintri <- "+proj=wintri +datum=WGS84 +no_defs +over"
wb_geo <- st_transform(wb_geo, crs = crs_wintri)

Select and rename variables.

# drop geometry and rename World Bank indicator codes to readable names
wb_tib = wb_geo %>% st_drop_geometry() %>% rename(
  population = sp.pop.totl,
  pop_growth = sp.pop.grow,
  urban_pop = sp.urb.totl.in.zs,
  gdp = ny.gdp.mktp.pp.cd,
  gdp_trade = ne.trd.gnfs.zs,
  internet_access = it.net.user.zs,
  military_exp = ms.mil.xpnd.gd.zs
) %>% mutate(pc_gdp = gdp / population) %>%  # per-capita GDP derived from the two renamed columns
select(iso_a2, iso_a3, name, region_un, population, pop_growth, gdp, pc_gdp,
             urban_pop, internet_access, military_exp) # income_group, 

# (removed a redundant no-op self-assignment `wb_tib = wb_tib` here)

# peek at a random sample of countries
wb_tib %>% sample_n(10)
## # A tibble: 10 x 11
##    iso_a2 iso_a3 name   region_un population pop_growth     gdp pc_gdp urban_pop
##    <chr>  <chr>  <chr>  <chr>          <dbl>      <dbl>   <dbl>  <dbl>     <dbl>
##  1 ID     IDN    Indon… Asia       270625568      1.10  3.34e12 12335.      56.0
##  2 BA     BIH    Bosni… Europe       3301000     -0.692 5.24e10 15883.      48.6
##  3 IE     IRL    Irela… Europe       4941444      1.51  4.36e11 88241.      63.4
##  4 IR     IRN    Iran   Asia        82913906      1.35  1.07e12 12937.      75.4
##  5 TZ     TZA    Tanza… Africa      58005463      2.95  1.56e11  2697.      34.5
##  6 AE     ARE    Unite… Asia         9770529      1.44  6.85e11 70089.      86.8
##  7 SL     SLE    Sierr… Africa       7813215      2.11  1.40e10  1794.      42.5
##  8 NA     NAM    Namib… Africa       2494530      1.87  2.51e10 10064.      51.0
##  9 NI     NIC    Nicar… Americas     6545502      1.23  3.70e10  5646.      58.8
## 10 EE     EST    Eston… Europe       1326590      0.348 5.16e10 38915.      69.1
## # … with 2 more variables: internet_access <dbl>, military_exp <dbl>

Transform problematic variables:

# get correlations
# Kendall correlations between numeric indicators; complete-case pairs for missing data
wb_corr = wb_tib %>% select_if(is.numeric) %>% correlate(method='kendall', use="na.or.complete") %>% stretch()
## 
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
# observe high correlations
# |tau| > .5 flags strongly associated indicator pairs
wb_corr %>% filter(r > .5 | r < -.5)
## # A tibble: 8 x 3
##   x               y                   r
##   <chr>           <chr>           <dbl>
## 1 population      gdp             0.510
## 2 gdp             population      0.510
## 3 pc_gdp          urban_pop       0.553
## 4 pc_gdp          internet_access 0.794
## 5 urban_pop       pc_gdp          0.553
## 6 urban_pop       internet_access 0.554
## 7 internet_access pc_gdp          0.794
## 8 internet_access urban_pop       0.554
# histograms to see normality
# one facet per indicator; free x scales accommodate the different units
ggplot(gather(wb_tib %>% select_if(is.numeric)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).

The only variables whose skewness is a serious concern are population and gdp. They will be normalised with log10.

# transform and scale
# log10(x+1) to reduce the right skew of these indicators
wb_trans_tib = wb_tib %>% mutate( gdp = log10(gdp+1), 
                                population = log10(population+1),
                                pc_gdp = log10(pc_gdp+1),
                                military_exp = log10(military_exp+1))

# re-check the distributions after the transform
ggplot(gather(wb_trans_tib %>% select_if(is.numeric)), aes(value)) + 
    geom_histogram(bins = 20) + 
    facet_wrap(~key, scales = 'free_x')
## Warning: Removed 36 rows containing non-finite values (stat_bin).

Multi-dimensional scaling of countries to show clustering:

# classical MDS of countries on the scaled numeric indicators.
# Drop ALL identifier/categorical columns before dist(): the original kept
# iso_a3 and region_un (character), which dist() coerces to NA — this raised
# the "NAs introduced by coercion" warning and silently distorted distances.
mds <- wb_trans_tib %>% select(-iso_a2, -iso_a3, -name, -region_un) %>% 
  mutate_if(is.numeric, scale) %>%
  dist() %>% cmdscale() %>% as_tibble()
colnames(mds) <- c("Dim1", "Dim2")
# attach labels for plotting (row order preserved by the pipeline)
mds$name = wb_trans_tib$iso_a3
mds$region_un = wb_trans_tib$region_un

# plot MDS
ggplot(mds, aes(x=Dim1, y=Dim2, color=region_un)) +
  ggtitle("Multi-dimensional scaling of countries") + 
  #geom_point(size=0) +
  geom_text(
    label = mds$name, 
    size = 2,
    #nudge_x = .25, nudge_y = .25
    #check_overlap = T
  ) + theme_bw()

# mds no longer needed
rm(mds)

4.3.2 Calc uniqueness

# distance metric for uniqueness; alternatives kept commented for experiments
#meth <- 'mahalanobis'
meth <- 'manhattan'
#meth <- 'euclidean'
# drop absolute-size variables (population, gdp) before computing uniqueness
filt_wb_trans_tib <- wb_trans_tib %>% select(-population, -gdp)

# use TRUE (not the reassignable shorthand T) for impute_na
wb_uniq <- dist_uniq(filt_wb_trans_tib, meth, impute_na = TRUE)
## [1] "dist_uniq 157 9 manhattan"
## [1] "imputing missing values..."
## [1] "scaling data..."
summary(wb_uniq)
##     iso_a2             iso_a3              name            region_un        
##  Length:157         Length:157         Length:157         Length:157        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     pop_growth.V1          pc_gdp.V1           urban_pop.V1    
##  Min.   :-2.7301313   Min.   :-2.3996204   Min.   :-2.1146868  
##  1st Qu.:-0.7484615   1st Qu.:-0.7282080   1st Qu.:-0.7710306  
##  Median : 0.0270864   Median : 0.0835194   Median : 0.0424196  
##  Mean   : 0.0000000   Mean   : 0.0000000   Mean   : 0.0000000  
##  3rd Qu.: 0.7608670   3rd Qu.: 0.8291936   3rd Qu.: 0.8433711  
##  Max.   : 2.7727087   Max.   : 1.8291785   Max.   : 1.7855400  
##   internet_access.V1    military_exp.V1     sum_dists      sum_dists_dec   
##  Min.   :-1.8469114   Min.   :-2.418894   Min.   : 669.1   Min.   : 1.000  
##  1st Qu.:-0.9510277   1st Qu.:-0.531860   1st Qu.: 779.1   1st Qu.: 3.000  
##  Median : 0.2865862   Median :-0.084567   Median : 854.3   Median : 5.000  
##  Mean   : 0.0000000   Mean   : 0.000000   Mean   : 885.0   Mean   : 5.433  
##  3rd Qu.: 0.8885776   3rd Qu.: 0.412638   3rd Qu.: 961.8   3rd Qu.: 8.000  
##  Max.   : 1.4961384   Max.   : 3.388462   Max.   :1361.3   Max.   :10.000  
##   sum_dists_z      sum_dists_z_p          sum_dists_class sum_dists_p_thresh
##  Min.   :-1.4363   Min.   :0.0007658   very rare  :  2    ***:  2           
##  1st Qu.:-0.7043   1st Qu.:0.3046804   rare       :  2    ** :  2           
##  Median :-0.2043   Median :0.5809292   medium     :  7    *  :  7           
##  Mean   : 0.0000   Mean   :0.5255461   common     :  7    .  :  7           
##  3rd Qu.: 0.5110   3rd Qu.:0.7593628   very common:139       :139           
##  Max.   : 3.1686   Max.   :0.9245428                                        
##  dist_method       
##  Length:157        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# BUG FIX: the variable defined above is `meth`, not `method` — `method` is
# undefined at this point in the script, so the original file name was wrong
openxlsx::write.xlsx(wb_uniq %>% mutate_if(is.numeric, round, 4),
                     paste0('analysis/wb_countries_uniqueness-', meth, '.xlsx'))

wb_uniq %>% head(20)
## # A tibble: 20 x 16
##    iso_a2 iso_a3 name          region_un pop_growth[,1] pc_gdp[,1] urban_pop[,1]
##    <chr>  <chr>  <chr>         <chr>              <dbl>      <dbl>         <dbl>
##  1 BI     BDI    Burundi       Africa             1.60     -2.40          -2.11 
##  2 NE     NER    Niger         Africa             2.18     -1.98          -1.97 
##  3 BH     BHR    Bahrain       Asia               2.77      1.16           1.31 
##  4 OM     OMN    Oman          Asia               1.46      0.724          1.13 
##  5 KW     KWT    Kuwait        Asia               0.320     1.25           1.79 
##  6 SA     SAU    Saudi Arabia  Asia               0.320     1.20           1.07 
##  7 MW     MWI    Malawi        Africa             1.17     -2.10          -1.94 
##  8 CD     COD    Dem. Rep. Co… Africa             1.65     -2.07          -0.685
##  9 TD     TCD    Chad          Africa             1.47     -1.75          -1.66 
## 10 PG     PNG    Papua New Gu… Oceania            0.567    -0.873         -2.11 
## 11 UG     UGA    Uganda        Africa             1.97     -1.47          -1.62 
## 12 MD     MDA    Moldova       Europe            -2.73      0.0816        -0.789
## 13 MZ     MOZ    Mozambique    Africa             1.40     -1.94          -1.07 
## 14 ET     ETH    Ethiopia      Africa             1.12     -1.46          -1.76 
## 15 MG     MDG    Madagascar    Africa             1.18     -1.72          -1.01 
## 16 SG     SGP    Singapore     Asia              -0.143     1.83           1.79 
## 17 IL     ISR    Israel        Asia               0.521     1.06           1.45 
## 18 LR     LBR    Liberia       Africa             0.982    -1.84          -0.390
## 19 TG     TGO    Togo          Africa             0.976    -1.75          -0.811
## 20 BF     BFA    Burkina Faso  Africa             1.35     -1.47          -1.36 
## # … with 9 more variables: internet_access <dbl[,1]>, military_exp <dbl[,1]>,
## #   sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## #   sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## #   dist_method <chr>

4.3.3 Analyse and viz

Based on https://r-spatial.github.io/sf/articles/sf5.html

# correlations of sum_dists against all numeric columns, strongest first
# (one step per line; select_if is superseded by select(where()))
corr_tib <- wb_uniq %>%
  select(where(is.numeric)) %>%
  correlate(method = 'kendall') %>%
  stretch() %>%
  filter(x == 'sum_dists') %>%
  arrange(-r)
## 
## Correlation method: 'kendall'
## Missing treated using: 'pairwise.complete.obs'
# plot distributions of the three uniqueness summaries (pivot_longer
# supersedes gather)
ggplot(
  pivot_longer(wb_uniq %>%
                 select(sum_dists, sum_dists_z, sum_dists_z_p),
               everything(), names_to = "key"),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free_x')

# data for maps: attach geometries by ISO code (`<-`, not `=`)
wb_uniq_geo <- merge(wb_geo %>% select(iso_a3), wb_uniq, by = 'iso_a3')

Generate world maps with uniqueness information:

# reproject for mapping (crs_wintri — presumably Winkel tripel; defined
# elsewhere in the project — TODO confirm) and draw the uniqueness map
wb_uniq_geo <- st_transform(wb_uniq_geo, crs = crs_wintri)
plot_uniq_map(wb_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

gen_variable_heatmap(wb_uniq_geo)
## # A tibble: 210 x 3
##    row_id  col_name        value[,1]
##    <chr>   <chr>               <dbl>
##  1 Burundi pop_growth       1.60    
##  2 Burundi pc_gdp          -2.40    
##  3 Burundi urban_pop       -2.11    
##  4 Burundi internet_access -1.76    
##  5 Burundi military_exp     0.174   
##  6 Burundi sum_dists_z      3.17    
##  7 Burundi sum_dists_z_p    0.000766
##  8 Niger   pop_growth       2.18    
##  9 Niger   pc_gdp          -1.98    
## 10 Niger   urban_pop       -1.97    
## # … with 200 more rows

# spell out FALSE — `F` is an ordinary variable that can be reassigned
gen_variable_heatmap(wb_uniq_geo, FALSE)
## # A tibble: 210 x 3
##    row_id   col_name        value[,1]
##    <chr>    <chr>               <dbl>
##  1 Bolivia  pop_growth         0.0801
##  2 Bolivia  pc_gdp            -0.269 
##  3 Bolivia  urban_pop          0.427 
##  4 Bolivia  internet_access   -0.377 
##  5 Bolivia  military_exp      -0.164 
##  6 Bolivia  sum_dists_z       -1.44  
##  7 Bolivia  sum_dists_z_p      0.925 
##  8 Paraguay pop_growth        -0.0373
##  9 Paraguay pc_gdp             0.0570
## 10 Paraguay urban_pop          0.0716
## # … with 200 more rows

4.4 EU (Eurostat data)

4.4.1 Prep data

# load EU data and remove French Guyana for cartography
eu_geo = read_sf("data/eurostat/eu_vars_with_bounds/eu_nuts_2_eurostat_2018.geojson") %>%
  filter(!(NUTS_ID %in% c('FRY3','FRY2','FRY1','FRY4','FRY5')))

summary(eu_geo)
##    NUTS_ID               id             CNTR_CODE          NUTS_NAME        
##  Length:327         Length:327         Length:327         Length:327        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    LEVL_CODE     FID                geo            broadband_access_household
##  Min.   :2   Length:327         Length:327         Min.   : 84.0             
##  1st Qu.:2   Class :character   Class :character   1st Qu.: 97.0             
##  Median :2   Mode  :character   Mode  :character   Median : 99.0             
##  Mean   :2                                         Mean   : 97.5             
##  3rd Qu.:2                                         3rd Qu.: 99.0             
##  Max.   :2                                         Max.   :100.0             
##                                                    NA's   :152               
##  disp_income_household gender_employ_gap life_expectancy_f life_expectancy_m
##  Min.   :12300         Min.   : 0.70     Min.   :77.30     Min.   :70.1     
##  1st Qu.:16125         1st Qu.: 7.90     1st Qu.:82.00     1st Qu.:76.4     
##  Median :17450         Median :10.20     Median :83.50     Median :79.0     
##  Mean   :18742         Mean   :13.49     Mean   :83.33     Mean   :78.2     
##  3rd Qu.:20075         3rd Qu.:15.75     3rd Qu.:84.90     3rd Qu.:80.4     
##  Max.   :47000         Max.   :48.80     Max.   :88.10     Max.   :83.0     
##  NA's   :277           NA's   :4         NA's   :4         NA's   :4        
##  life_expectancy_t   region_gdp     region_population  unemployment_rate_f
##  Min.   :73.60     Min.   :  1365   Min.   :   29489   Min.   : 1.600     
##  1st Qu.:79.30     1st Qu.: 16739   1st Qu.:  940271   1st Qu.: 3.500     
##  Median :81.50     Median : 34901   Median : 1506232   Median : 5.450     
##  Mean   :80.79     Mean   : 53363   Mean   : 1889104   Mean   : 7.891     
##  3rd Qu.:82.60     3rd Qu.: 65104   3rd Qu.: 2310766   3rd Qu.: 9.350     
##  Max.   :85.50     Max.   :733875   Max.   :15029231   Max.   :36.300     
##  NA's   :4         NA's   :17                          NA's   :17         
##  unemployment_rate_m unemployment_rate_t          geometry  
##  Min.   : 0.800      Min.   : 1.300      MULTIPOLYGON :327  
##  1st Qu.: 3.800      1st Qu.: 3.600      epsg:4326    :  0  
##  Median : 5.300      Median : 5.400      +proj=long...:  0  
##  Mean   : 6.745      Mean   : 7.064                         
##  3rd Qu.: 8.200      3rd Qu.: 8.675                         
##  Max.   :22.900      Max.   :29.000                         
##  NA's   :14          NA's   :5
# project data to ETRS 89 LAEA, suitable for Europe
eu_geo <- eu_geo %>% st_transform(3035)

# get more variables
#eu_tib = read_tsv('data/eurostat/eu_all_geo_data.tsv')
#summary(eu_tib)

# keep identifiers plus the analysis variables, and derive per-capita GDP
eu_geo <- eu_geo %>%
  select(NUTS_ID, CNTR_CODE, NUTS_NAME, region_population, life_expectancy_t,
         region_gdp, unemployment_rate_t) %>%
  mutate(pc_gdp = region_gdp / region_population * 1e6)

eu_geo %>% sample_n(10)
## Simple feature collection with 10 features and 8 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: 3705361 ymin: 1688237 xmax: 6723510 ymax: 3724038
## projected CRS:  ETRS89-extended / LAEA Europe
## # A tibble: 10 x 9
##    NUTS_ID CNTR_CODE NUTS_NAME      region_populati… life_expectancy… region_gdp
##  * <chr>   <chr>     <chr>                     <dbl>            <dbl>      <dbl>
##  1 TR72    TR        Kayseri, Siva…          2416673             78.7     14537.
##  2 FRE2    FR        Picardie                1928795             81.2     48158.
##  3 PL84    PL        Podlaskie               1154856             78.2     10868.
##  4 TR83    TR        Samsun, Tokat…          2773386             78.9     14212.
##  5 TR32    TR        Aydin, Denizl…          3038325             79.6     21353.
##  6 BE23    BE        Prov. Oost-Vl…          1506232             82.3     56415.
##  7 TR42    TR        Kocaeli, Saka…          3805481             78.5     41346.
##  8 DE22    DE        Niederbayern            1230037             81.1     48466.
##  9 BG42    BG        Южен централен          1417432             75.5      7943.
## 10 SE22    SE        Sydsverige              1504060             82.5     59578.
## # … with 3 more variables: unemployment_rate_t <dbl>,
## #   geometry <MULTIPOLYGON [m]>, pc_gdp <dbl>

Transform variables

# pairwise Kendall correlations of the numeric variables, strongest first
eu_geo %>%
  select_if(is.numeric) %>%
  st_drop_geometry() %>%
  correlate(method = 'kendall', use = "na.or.complete") %>%
  stretch() %>%
  arrange(-r)
## 
## Correlation method: 'kendall'
## Missing treated using: 'na.or.complete'
## # A tibble: 25 x 3
##    x                   y                       r
##    <chr>               <chr>               <dbl>
##  1 region_population   region_gdp          0.509
##  2 region_gdp          region_population   0.509
##  3 region_gdp          pc_gdp              0.472
##  4 pc_gdp              region_gdp          0.472
##  5 life_expectancy_t   pc_gdp              0.384
##  6 pc_gdp              life_expectancy_t   0.384
##  7 life_expectancy_t   region_gdp          0.287
##  8 region_gdp          life_expectancy_t   0.287
##  9 life_expectancy_t   unemployment_rate_t 0.130
## 10 unemployment_rate_t life_expectancy_t   0.130
## # … with 15 more rows
# plot distributions (pivot_longer/select(where()) supersede gather/select_if)
ggplot(
  pivot_longer(eu_geo %>% st_drop_geometry() %>% select(where(is.numeric)),
               everything(), names_to = "key"),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).

# transform: log10(x + 1) tames the right skew; across() replaces the four
# copy-pasted mutate terms
eu_trans_geo <- eu_geo %>%
  mutate(across(c(region_population, region_gdp, pc_gdp, unemployment_rate_t),
                ~ log10(.x + 1)))

# re-check distributions after the transform (pivot_longer supersedes gather)
ggplot(
  pivot_longer(eu_trans_geo %>% st_drop_geometry() %>%
                 select(where(is.numeric)),
               everything(), names_to = "key"),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free_x')
## Warning: Removed 43 rows containing non-finite values (stat_bin).

4.4.2 Calc uniqueness

# select variables for uniqueness: drop absolute GDP (pc_gdp is kept)
eu_trans_geo_filt <- eu_trans_geo %>% select(-region_gdp)

# distance metric for uniqueness (`<-`, not `=`)
#method <- 'euclidean'
method <- 'mahalanobis'
eu_uniq <- dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method)
## [1] "dist_uniq 327 7 mahalanobis"
## [1] "scaling data..."
## Warning in dist_uniq(eu_trans_geo_filt %>% st_drop_geometry(), method):
## distances could not be calculated for 22 cases. Set impute_na to TRUE to impute
## missing values.
# export rounded results; file name is tagged with the distance method
openxlsx::write.xlsx(eu_uniq %>% mutate_if(is.numeric, round, 4),
                     paste0('analysis/eu_nuts2_uniqueness-',method,'.xlsx'))

eu_uniq %>% head(20) # %>% View(.)
## # A tibble: 20 x 14
##    NUTS_ID CNTR_CODE NUTS_NAME               region_populatio… life_expectancy_…
##    <chr>   <chr>     <chr>                               <dbl>             <dbl>
##  1 ES63    ES        Ciudad Autónoma de Ceu…           -3.39              0.287 
##  2 ES64    ES        Ciudad Autónoma de Mel…           -3.39             -0.0341
##  3 UKI3    UK        Inner London - West               -0.216             1.57  
##  4 BE10    BE        Région de Bruxelles-Ca…           -0.183             0.287 
##  5 BG31    BG        Северозападен                     -0.748            -2.88  
##  6 TRC2    TR        Sanliurfa, Diyarbakir              1.17             -0.555 
##  7 FR10    FR        Ile-de-France                      2.62              1.49  
##  8 TRA2    TR        Agri, Kars, Igdir, Ard…           -0.277            -1.12  
##  9 EL41    EL        Βόρειο Αιγαίο                     -2.29              0.848 
## 10 TR10    TR        Istanbul                           2.87             -0.555 
## 11 ES61    ES        Andalucía                          2.17              0.527 
## 12 EL53    EL        Δυτική Μακεδονία                  -2.00              0.888 
## 13 LV00    LV        Latvija                            0.388            -2.28  
## 14 TR90    TR        Trabzon                            0.761            -0.0742
## 15 TRB2    TR        Van, Mus, Bitlis, Hakk…            0.504            -0.956 
## 16 ITC2    IT        Valle d'Aosta/Vallée d…           -2.91              0.607 
## 17 EL54    EL        Ήπειρος                           -1.73              1.13  
## 18 TRB1    TR        Malatya, Elazig, Bingö…            0.251            -0.275 
## 19 LU00    LU        Luxembourg                        -1.02              0.607 
## 20 RS22    RS        Регион Јужне и Источне…            0.0917           -2.20  
## # … with 9 more variables: unemployment_rate_t <dbl[,1]>, pc_gdp <dbl[,1]>,
## #   sum_dists <dbl>, sum_dists_dec <int>, sum_dists_z <dbl>,
## #   sum_dists_z_p <dbl>, sum_dists_class <fct>, sum_dists_p_thresh <fct>,
## #   dist_method <chr>

4.4.3 Analyse and viz

Maps and heatmaps.

# data: attach geometries by NUTS id, then build a display name from the
# country code and region name (`<-`, not `=`)
eu_uniq_geo <- merge(eu_geo %>% select(NUTS_ID), eu_uniq, by = 'NUTS_ID')
eu_uniq_geo$name <- paste(eu_uniq_geo$CNTR_CODE, '-', eu_uniq_geo$NUTS_NAME)

# maps
plot_uniq_map(eu_uniq_geo)
## tmap mode set to plotting

## Variable(s) "sum_dists_z" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

# heatmaps
gen_variable_heatmap(eu_uniq_geo)
## # A tibble: 180 x 3
##    row_id                          col_name            value[,1]
##    <chr>                           <chr>                   <dbl>
##  1 ES - Ciudad Autónoma de Ceuta   region_population    -3.39e+0
##  2 ES - Ciudad Autónoma de Ceuta   life_expectancy_t     2.87e-1
##  3 ES - Ciudad Autónoma de Ceuta   unemployment_rate_t   2.71e+0
##  4 ES - Ciudad Autónoma de Ceuta   pc_gdp               -1.40e-1
##  5 ES - Ciudad Autónoma de Ceuta   sum_dists_z           5.69e+0
##  6 ES - Ciudad Autónoma de Ceuta   sum_dists_z_p         6.25e-9
##  7 ES - Ciudad Autónoma de Melilla region_population    -3.39e+0
##  8 ES - Ciudad Autónoma de Melilla life_expectancy_t    -3.41e-2
##  9 ES - Ciudad Autónoma de Melilla unemployment_rate_t   2.50e+0
## 10 ES - Ciudad Autónoma de Melilla pc_gdp               -2.53e-1
## # … with 170 more rows

# spell out FALSE — `F` is an ordinary variable that can be reassigned
gen_variable_heatmap(eu_uniq_geo, FALSE)
## # A tibble: 180 x 3
##    row_id                 col_name            value[,1]
##    <chr>                  <chr>                   <dbl>
##  1 UK - South Yorkshire   region_population    -0.00487
##  2 UK - South Yorkshire   life_expectancy_t    -0.235  
##  3 UK - South Yorkshire   unemployment_rate_t  -0.247  
##  4 UK - South Yorkshire   pc_gdp                0.161  
##  5 UK - South Yorkshire   sum_dists_z          -1.20   
##  6 UK - South Yorkshire   sum_dists_z_p         0.886  
##  7 SI - Vzhodna Slovenija region_population    -0.305  
##  8 SI - Vzhodna Slovenija life_expectancy_t    -0.114  
##  9 SI - Vzhodna Slovenija unemployment_rate_t  -0.129  
## 10 SI - Vzhodna Slovenija pc_gdp               -0.278  
## # … with 170 more rows

End of notebook